Data collected by exporting chat without media (format txt)
Let us start by setting up our environment: we need to install the required packages and libraries.
!pip install emoji
!git clone https://github.com/amueller/word_cloud.git
!pip install wordcloud
!pip install regex
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import regex
import re
import emoji
import plotly.graph_objects as go
from collections import Counter
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
%matplotlib inline
Firstly, we need to parse and tokenize the plain-text (txt) export in order to present its information in a meaningful way.
Four tokens have been identified: date, time, author and message: {Date}{Time}{Author}{Message}
Example: {01/01/2020}{9:00}{Javier}{Buenos días chicos, ¿como estáis?} Therefore, parsing and tokenization functions have been developed as follows:
# Compiled once: 'd/m/yy hh:mm -' timestamp at the start of a line.
# Raw string fixes the invalid '\/' escape of the original pattern.
DATETIME_PATTERN = re.compile(r'^([0-9]+)(/)([0-9]+)(/)([0-9]+) ([0-9]+):([0-9]+)[ ]?-')

def startDateTime(s):
    """Return True if *s* starts with a WhatsApp 'date time -' prefix."""
    return bool(DATETIME_PATTERN.match(s))

startDateTime('17/3/19 15:06 - Gabo: Hola')
def findAuthor(s):
    """Return True when *s* looks like 'Author: message'.

    Splits at most once on ': ' so messages that themselves contain a
    colon (e.g. 'Gabo: mira: esto') are still attributed correctly —
    the original `len(s.split(":")) == 2` check mis-classified those
    as authorless system messages.
    """
    return len(s.split(': ', 1)) == 2

s = '17/3/19 15:06 - Gabo: Hola'
findAuthor(s)
def dataPoint(line):
    """Parse one timestamped chat line into (date, time, author, message).

    Expected shape: '17/3/19 15:06 - Gabo: Hola'.  Author is None for
    system messages ('17/3/19 15:06 - Has sido añadido al grupo').
    """
    splitLine = line.split(' - ')
    dateTime = splitLine[0]
    date, time = dateTime.split(' ')
    # fix: rejoin with ' - ' (the original joined with ' ', corrupting
    # messages that contain ' - ' themselves)
    message = ' - '.join(splitLine[1:])
    if ': ' in message:
        # fix: split only once so colons inside the message survive
        author, message = message.split(': ', 1)
    else:
        author = None
    return date, time, author, message
# Read the exported chat and build one [date, time, author, message]
# row per message, merging multi-line messages into a single row.
parsedData = []
cnvPath = 'Alicante.txt'
with open(cnvPath, encoding='utf-8') as fp:
    fp.readline()  # skip the WhatsApp header/encryption notice
    messageBuffer = []
    date, time, author = None, None, None  # fix: was 'autor' (typo)
    while True:
        line = fp.readline()
        if not line:
            break
        line = line.strip()
        if startDateTime(line):
            # a new message starts: flush the previous one first
            if len(messageBuffer) > 0:
                parsedData.append([date, time, author, ' '.join(messageBuffer)])
            messageBuffer.clear()
            date, time, author, message = dataPoint(line)
            messageBuffer.append(message)
        else:
            # continuation line of a multi-line message
            messageBuffer.append(line)
    # fix: flush the final message — the original dropped it at EOF
    if len(messageBuffer) > 0:
        parsedData.append([date, time, author, ' '.join(messageBuffer)])

df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message'])
# fix: dates are d/m/y (Spanish export), so parse day-first explicitly
df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)
df.head(5)
df.info()
Now, let's list the participants so that we can examine each author's activity.
# Distinct senders in the chat; None marks messages without an author
df.Author.unique()
None refers to those messages that do not have an author, such as "Has sido añadido al grupo" or links added in the conversation.
Removing the messages with no author:
# Drop system messages (rows whose Author is missing)
df = df.dropna()
df.info()
df.Author.unique()

# Overall message count
Total_msg = len(df)
print(Total_msg)

# Media messages are exported as a fixed placeholder string
Media_msg = int((df['Message'] == '<Multimedia omitido>').sum())
print(Media_msg)
def split_count(text):
    """Return the list of emoji grapheme clusters contained in *text*.

    Uses regex's \\X to keep multi-codepoint emoji (skin tones, ZWJ
    sequences) together as one unit.
    """
    # fix: emoji>=2.0 removed UNICODE_EMOJI in favour of EMOJI_DATA;
    # support both so the notebook runs on old and new versions.
    emoji_bank = getattr(emoji, 'EMOJI_DATA', None)
    if emoji_bank is None:
        emoji_bank = emoji.UNICODE_EMOJI
    emoji_lst = []
    for cluster in regex.findall(r'\X', text):
        if any(char in emoji_bank for char in cluster):
            emoji_lst.append(cluster)
    return emoji_lst
# Extract emoji per message and count them overall
df["emoji"] = df["Message"].apply(split_count)
emojis = int(df['emoji'].str.len().sum())
print(emojis)

# Count URLs per message with a vectorized findall
URLpattern = r'(https?://\S+)'
df['urlcount'] = df.Message.str.findall(URLpattern).str.len()
links = int(df.urlcount.sum())
print(links)

print("Group Wise Stats")
print("Messages:", Total_msg)
print("Media:", Media_msg)
print("Emojis:", emojis)
print("Links:", links)
# Split the frame into media placeholders vs. real text messages
media_mask = df['Message'] == '<Multimedia omitido>'
Media_msg_df = df[media_mask]
msg_df = df[~media_mask]
msg_df.info()
Let's count the number of letters and words used by each author per message. To do so, two new columns have been added to the data frame as follows:
# Per-message character and word counts (word count splits on single
# spaces, matching the original behaviour).
msg_df['Letter_Count'] = msg_df['Message'].str.len()
msg_df['Word_Count'] = msg_df['Message'].str.split(' ').str.len()
msg_df["MessageCount"] = 1
msg_df.head(5)
# fix: read the emoji column from msg_df itself rather than from the
# full df (the original only worked through implicit index alignment)
msg_df["emojicount"] = msg_df['emoji'].str.len()
msg_df.head(5)
# Print a small stats report for every participant
for author in msg_df.Author.unique():
    author_df = msg_df[msg_df['Author'] == author]
    print(f'Stats of {author} -')
    print('Messages sent', author_df.shape[0])
    words_per_msg = np.sum(author_df['Word_Count']) / author_df.shape[0]
    print('Words per message', words_per_msg)
    media = Media_msg_df[Media_msg_df['Author'] == author].shape[0]
    print('Media Messages Sent', media)
    emojis = sum(author_df['emoji'].str.len())
    print('Emojis Sent', emojis)
    links = sum(author_df["urlcount"])
    print('Links Sent', links)
    print()
Unique emojis used
# How many distinct emoji were used in the whole chat
total_emojis_list = list({e for msg_emojis in msg_df.emoji for e in msg_emojis})
total_emojis = len(total_emojis_list)
print(total_emojis)
Most used emojis
# Frequency table of every emoji, most used first
all_emojis = [e for msg_emojis in msg_df.emoji for e in msg_emojis]
emoji_dict = Counter(all_emojis).most_common()
print(emoji_dict)

emoji_df = pd.DataFrame(emoji_dict, columns=['emoji', 'count'])
emoji_df.head(11)
import plotly.express as px

# Pie chart showing each emoji's share of total usage
fig = px.pie(emoji_df, names='emoji', values='count')
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()
# Word cloud over every text message in the chat
text = " ".join(review for review in msg_df.Message)
# fix: len(text) counts characters; split to count words as the
# message claims
print("There are {} words in all the messages.".format(len(text.split())))

stopwords = set(STOPWORDS)
wordcloud = WordCloud(stopwords=stopwords, background_color="black").generate(text)

plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
# One word cloud per participant
stopwords = set(STOPWORDS)  # hoisted: identical set every iteration
for author in msg_df.Author.unique():
    author_text = " ".join(m for m in msg_df[msg_df['Author'] == author].Message)
    print('Author name', author)
    wordcloud = WordCloud(stopwords=stopwords, background_color="black").generate(author_text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
Day Distribution
def f(i):
    """Translate a pandas weekday index (0=Monday … 6=Sunday) to its name."""
    day_names = ("Monday", "Tuesday", "Wednesday",
                 "Thursday", "Friday", "Saturday", "Sunday")
    return day_names[i]
# Radar chart of message volume per weekday
day_df = pd.DataFrame(msg_df["Message"])
day_df['day_date'] = msg_df['Date'].dt.weekday
day_df['day_date'] = day_df['day_date'].apply(f)
day_df['MessageCount'] = 1
# fix: aggregate only the numeric column — summing the string
# 'Message' column concatenates text and raises on modern pandas
day = day_df.groupby("day_date", as_index=False)["MessageCount"].sum()
# fix: order the axis Monday→Sunday instead of alphabetically
week_order = ["Monday", "Tuesday", "Wednesday", "Thursday",
              "Friday", "Saturday", "Sunday"]
day['day_date'] = pd.Categorical(day['day_date'], categories=week_order, ordered=True)
day = day.sort_values('day_date')
day['day_date'] = day['day_date'].astype(str)

fig = px.line_polar(day, r='MessageCount', theta='day_date', line_close=True)
fig.update_traces(fill="toself")
fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
        )),
    showlegend=False
)
fig.show()
# Messages per day over the life of the chat
# fix: sum only MessageCount — summing every column breaks on modern
# pandas because Message/Author/Time are strings
date_df = msg_df.groupby("Date", as_index=False)["MessageCount"].sum()

fig = px.line(date_df, x="Date", y="MessageCount")
fig.update_xaxes(nticks=20)
fig.show()

# 30-day moving average smooths the daily series
date_df["rolling"] = date_df["MessageCount"].rolling(30).mean()
fig = px.line(date_df, x="Date", y="rolling")
fig.update_xaxes(nticks=20)
fig.show()
Top 10
# Ten busiest times of day
busiest_times = msg_df['Time'].value_counts().head(10)
busiest_times.plot.barh()
plt.xlabel('Number of messages')
plt.ylabel('Time')

# Ten busiest dates (full counts printed for reference)
date_counts = msg_df['Date'].value_counts()
date_counts.head(10).plot.barh()
print(date_counts)
plt.xlabel('Number of Messages')
plt.ylabel('Date')